# extra appendix figures (denoted below)
library(tidyverse)
library(data.table)
library(dtplyr)

source("pundits_functions.R")

# Seed the RNG up front. Nothing below visibly draws random numbers, so this
# presumably guards helpers sourced from pundits_functions.R -- TODO confirm
set.seed(seed = 1111)

# Loaded here but not referenced again in the visible part of this script
pundits_agg0 <- readr::read_csv(file = "../data/pundits_parrot_aggregated_imp0.csv")

# Ideal points per pundit; feeds figures C.1 and G.1 below
ideal.points_d1 <- readr::read_csv(file = "../data/ideal_points_masked.csv")

# code to generate figure C.1 (won't run, as twitter handle is masked)
# Ranks pundits by descending ideal point (rank 1 = largest value) and plots
# rank against ideal point, labelling every 20th rank (1, 21, 41, ...) with
# its twitter handle.
sampled_pundits_inspace <-
  ideal.points_d1 %>%
  arrange(desc(ideal.point)) %>%
  mutate(
    # row_number() stays correct on zero rows, where 1:n() would give c(1, 0)
    ideo_rank = row_number(),
    # ranks 1, 21, 41, ... keep their handle; all others get no label
    lab_ten = ifelse(ideo_rank %% 20 == 1, twitter_handle, NA)
  ) %>%
  ggplot() +
  geom_point(aes(x = ideal.point, y = ideo_rank)) +
  # labels are shifted 2 units right so they do not sit on top of the points
  geom_text(aes(x = ideal.point + 2, y = ideo_rank, label = lab_ten)) +
  labs(x = "First Latent Space Dimension",
       y = "Latent Space Dimension Rank (Lower = More Conservative)",
       title = "Pundits in Latent Space",
       subtitle = "Every 20th rank-ordered pundit labeled") +
  theme_jg()

# Arguments are named in full; the previous `file =` only worked through
# partial matching against ggsave()'s `filename` parameter.
ggsave(filename = "../figures/samples_inspace.png",
       plot = sampled_pundits_inspace,
       width = 8, height = 10)

# code to generate Figure D.1
# note: this section won't run due to unavailability of raw text data
fpath <- "../data/dictionary_subsets/full_sets/" # not available per terms of service

# `pattern` is a regular expression, not a glob: match a literal ".csv" suffix
files <- list.files(path = fpath, pattern = "\\.csv$")

# Drop the trump file(s) with a logical mask. The previous
# files[-which(...)] form returns an EMPTY vector when nothing matches,
# because negative indexing with integer(0) selects no elements.
files <- files[!grepl("trump", files)]

# Read every per-topic file and stack them into one data frame, tagging each
# row with its topic (the file name without the .csv extension).
bigdf <- bind_rows(lapply(files, function(x) {
  smalldf <- readr::read_csv(paste0(fpath, x))
  # anchored and escaped so only the trailing extension is stripped
  smalldf$topic <- sub("\\.csv$", "", x)
  smalldf
}))

# Per-topic tally: distinct document ids and distinct twitter handles
bigdf %>%
  group_by(topic) %>%
  summarise(
    n_tweets = n_distinct(id),
    n_users = n_distinct(twitter_handle)
  )

# Topic-by-topic overlap matrix.
# Cell [i, j] is the share of rows in topic i (source) whose document id also
# appears in topic j (destination). The diagonal is left at 0.
# Topic names come from the file names (minus the .csv extension), so the
# matrix size follows the number of files instead of a hard-coded 22.
topic_names <- sub("\\.csv$", "", files)
n_topics <- length(topic_names)

# Split the document ids by topic once, rather than re-filtering bigdf twice
# for every cell. Fixing the factor levels keeps the list in matrix order and
# gives a topic with no rows an empty id vector.
ids_by_topic <- split(bigdf$id, factor(bigdf$topic, levels = topic_names))

tmat <- matrix(0, nrow = n_topics, ncol = n_topics,
               dimnames = list(topic_names, topic_names))

for (i in seq_len(n_topics)) {
  for (j in seq_len(n_topics)) {
    # skip the diagonal: self-overlap stays at its initial 0
    if (i != j) {
      tmat[i, j] <- mean(ids_by_topic[[i]] %in% ids_by_topic[[j]])
    }
  }
}

# Lookup table pairing each display label (formal_names) with its topic code
# (topic). Written as label/code pairs, one topic per row, so each pairing can
# be checked at a glance.
topic_refdf <- as.data.frame(
  matrix(
    c("China",                 "china",
      "Class",                 "class",
      "Climate",               "climate",
      "Conservative",          "conservative",
      "Democrat",              "democrat",
      "Far Left",              "far_left",
      "Far Right",             "far_right",
      "Gender",                "gender",
      "Guns",                  "guns",
      "Health Care/Insurance", "health_care_insurance",
      "Immigration",           "immigration",
      "Iran",                  "iran",
      "Israel",                "israel",
      "LGBT",                  "lgbt",
      "Liberal",               "liberal",
      "Mueller",               "mueller",
      "Progressive",           "progressive",
      "Race",                  "race",
      "Reproductive Health",   "reproductive_health",
      "Republican",            "republican",
      "Taxes and Spending",    "taxes_spending",
      "Trade",                 "trade"),
    ncol = 2,
    byrow = TRUE,
    dimnames = list(NULL, c("formal_names", "topic"))
  )
)

# Swap the topic codes on both axes for their display labels.
# NOTE(review): the assignment is positional, so it assumes the rows/columns of
# tmat are in the same order as topic_refdf -- confirm against the file listing
rownames(tmat) <- topic_refdf$formal_names
colnames(tmat) <- topic_refdf$formal_names

# Heatmap of the overlap matrix: Var1 (matrix rows, source topic) on the
# x-axis, Var2 (matrix columns, destination topic) on the y-axis.
ovelap_plot <-
  reshape2::melt(tmat) %>%
  # drop the diagonal (self-overlap) cells so they render as blank tiles
  filter(Var1 != Var2) %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(name = "Overlap",
                      low = "white",
                      high = "black") +
  labs(x = "Source Topic", y = "Destination Topic",
       title = "Topic Overlap",
       subtitle = "Cells represent proportion of documents in source topic (x-axis) that also mention destination topic (y-axis)") +
  theme_jg() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

# Arguments are named in full; the previous `file =` only worked through
# partial matching against ggsave()'s `filename` parameter.
ggsave(filename = "../figures/topic_overlap.png",
       plot = ovelap_plot,
       width = 12, height = 8)

# Rows per document id (one row per topic file the document appears in),
# sorted with the most frequent ids first
ntab <-
  bigdf %>%
  count(id, name = "n") %>%
  arrange(desc(n))

# share of documents that appear exactly once
with(ntab, mean(n == 1))

# ids that occur more than once in bigdf, and the share of documents they cover
dups <- with(bigdf, unique(id[duplicated(id)]))
with(ntab, mean(id %in% dups))


# code to generate figure G.1
# Scatter of latent-space ideal point against tweetscore, restricted to rows
# that have a tweetscore.
space_ts <-
  ideal.points_d1 %>%
  filter(!is.na(tweetscore)) %>%
  ggplot(aes(x = tweetscore, y = ideal.point)) +
  geom_point() +
  labs(title = "Latent Space and TweetScore Comparison") +
  theme_jg()

# Arguments are named in full; the previous `file =` only worked through
# partial matching against ggsave()'s `filename` parameter.
ggsave(filename = "../figures/space_ts_cor.png",
       plot = space_ts,
       width = 8, height = 5)

# raw correlation (rows missing either value are dropped pairwise)
with(ideal.points_d1, cor(ideal.point, tweetscore, use = "pairwise.complete.obs"))